library(tidyverse)
library(magrittr)
library(useful)
library(scales)
library(ggthemes)
library(ggrepel)
library(taigr)
library(ggpubr)
library(uwot)
library(ggpubr)Lecture 3 - Examples
We start by loading the necessary libraries:
Do not forget to specify the taigaclient (please change the path below for your own setup)
# make sure to edit this line:
options(taigaclient.path=path.expand("/Users/mkocak/anaconda3/envs/taigapy/bin/taigaclient"))Additionally if you have a problem with uwot::umap() function, please try to run these two lines (just once) to install Matrix and irlba libraries from their source codes:
install.packages("Matrix", type = "source")
install.packages("irlba", type = "source")Please check out Skyros download page for detailed information and other datasets (e.g. Proteomics, RNAi, miRNA, etc.)
Cell line metadata: This table contains depmap_id’s ccle_name’s and all the relevant cell line information, like OncoTree annotations,
Model <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='Model')Genetic dependencies: Matrices indexed by cell line (by depmap_id) and genes. Files ending with “GeneEffect” are scaled between -1 (median of essential genes) and 0 (median of non-essentials).
“CRISRGeneDependency” matrix contains the dependency probabilities (1 means very likely to be a dependency and 0 means very likely to be a non-dependency)
CRISPRGeneDependency <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='CRISPRGeneDependency')
CRISPRGeneEffect <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='CRISPRGeneEffect')
OrganoidGeneEffect <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OrganoidGeneEffect')
Omics Datasets:
# Copy Number Alterations
OmicsAbsoluteCNGene <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsAbsoluteCNGene')
OmicsArmLevelCNA <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsArmLevelCNA')
OmicsCNGene <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsCNGene')
# Gene Expression (by gene and by gene set)
OmicsExpressionGeneSetEnrichment <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsExpressionGeneSetEnrichment')
OmicsExpressionProteinCodingGenesTPMLogp1 <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsExpressionProteinCodingGenesTPMLogp1')
# Fusions
OmicsFusionFiltered <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsFusionFiltered')
# Mutation table and binary matrices for hotspot and binary mutations
OmicsSomaticMutations <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSomaticMutations')
OmicsSomaticMutationsMatrixDamaging <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSomaticMutationsMatrixDamaging')
OmicsSomaticMutationsMatrixHotspot <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSomaticMutationsMatrixHotspot')
# Loss of Heterozygosity
OmicsLoH <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsLoH')
# Other omics signatures: Aneuploidy, MSI, CIN, etc.
OmicsSignatures <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSignatures')
# Structural variants
OmicsStructuralVariants <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsStructuralVariants')PRISM Datasets:
# This is the compound meta-data used by the depmap portal, it is a busy table but it can become handy, the following transformations are cleaning and trimming it little bit
compound.metadata <- load.from.taiga(data.name='compound-metadata-de37', data.version=23, data.file='compound_metadata') %>%
dplyr::distinct(Drug.Name,
IDs, MOA,
repurposing_target,
Synonyms) %>%
dplyr::rename(Target = repurposing_target) %>%
tidyr::separate_rows(IDs, sep = ";")
# PRISM Repurposing log2fold-change viabilities (from last year)
Repurposing.Column.Meta <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_bootcamp')
Repurposing.Matrix <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_matrix_bootcamp')
# PRISM Oncology Reference Set
OncRef.Compound.List <- load.from.taiga(data.name='prism-oncology-reference-set-23q4-1a7c', data.version=14, data.file='PRISM_Oncology_Reference_23Q4-Compound_List')
OncRef.AUC.matrix <- load.from.taiga(data.name='prism-oncology-reference-set-23q4-1a7c', data.version=13, data.file='AUC_matrix') %>%
t() # Note this needs to be transposed.
OncRef.LFC.Matrix <- load.from.taiga(data.name='prism-oncology-reference-set-23q4-1a7c', data.version=13, data.file='PRISM_Oncology_Reference_23Q4_LFC_Matrix')Miscellanous:
# Methylation per gene
MethylationExpressionImpactBroad <- load.from.taiga(data.name='internal-beta-features-7130', data.version=31, data.file='MethylationExpressionImpactBroad')
MethylationExpressionImpactSanger <- load.from.taiga(data.name='internal-beta-features-7130', data.version=31, data.file='MethylationExpressionImpactSanger')
# Driver alterations (by Mike Burger)
DriverGeneFunctionalAlterations <- load.from.taiga(data.name='internal-beta-features-7130', data.version=31, data.file='DriverGeneFunctionalAlterations')
# Harmonized Expression Dataset : GTEx, TCGA & CCLE (2020, by Kevin Zhang)
harmonized.TCGA.GTEx.CCLE <- load.from.taiga(data.name='harmonized-tcga-gtex-target-and-ccle-baseline-expression-280a', data.version=8, data.file='harmonized_TCGA_GTEx_CCLE')
metadata.TCGA.GTEx.CCLE <- load.from.taiga(data.name='harmonized-tcga-gtex-target-and-ccle-baseline-expression-280a', data.version=8, data.file='metadata_TCGA_GTEx_CCLE')
protein.coding.genes <- load.from.taiga(data.name='harmonized-tcga-gtex-target-and-ccle-baseline-expression-280a', data.version=8, data.file='protein_coding_genes') # little bit outdated
In this vignette, we will start looking into the binary HotSpot mutation matrix and try to find pairs of mutations that appeared more or less frequent than expected.
First, let’s load the relevant data from taiga:
OmicsSomaticMutationsMatrixHotspot <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSomaticMutationsMatrixHotspot')
# dimensions of the matrix
paste0("Dimensions :" , dim(OmicsSomaticMutationsMatrixHotspot))[1] "Dimensions :2145" "Dimensions :525"
# take a peek at the top left corner of the matrix (to check the column and rownames)
corner(OmicsSomaticMutationsMatrixHotspot) PIK3CD (5293) MTOR (2475) KLHDC7A (127707) ARID1A (8289)
ACH-000062 0 0 0 0
ACH-001949 0 0 0 0
ACH-003131 0 0 0 0
ACH-000402 0 0 0 0
ACH-000693 0 0 0 0
ZSCAN20 (7579)
ACH-000062 0
ACH-001949 0
ACH-003131 0
ACH-000402 0
ACH-000693 0
# simplify the column names
colnames(OmicsSomaticMutationsMatrixHotspot) %<>% word()Next, we will start with a small concrete example: KRAS and EGFR
# Let's pick the relevant columns and put into a data frame (tibble)
EGFR.KRAS.mutations <- OmicsSomaticMutationsMatrixHotspot[, c("EGFR", "KRAS")] %>%
as_tibble() %>%
dplyr::mutate(ModelID = rownames(OmicsSomaticMutationsMatrixHotspot))
# Quick check
EGFR.KRAS.mutations %>%
head()# A tibble: 6 × 3
EGFR KRAS ModelID
<dbl> <dbl> <chr>
1 0 0 ACH-000062
2 0 0 ACH-001949
3 0 0 ACH-003131
4 0 0 ACH-000402
5 0 0 ACH-000693
6 0 0 ACH-000930
# What does 2 mean?
EGFR.KRAS.mutations %>%
dplyr::count(EGFR, KRAS)# A tibble: 5 × 3
EGFR KRAS n
<dbl> <dbl> <int>
1 0 0 1865
2 0 1 198
3 0 2 66
4 1 0 14
5 2 0 2
# Let's keep it simple: 1 for mutant, 0 for not-mutant
EGFR.KRAS.mutations <- EGFR.KRAS.mutations %>%
dplyr::mutate(EGFR = EGFR > 0, KRAS = KRAS > 0)Let’s create a contingency table and apply Fisher’s exact test, check ?fisher.test() for details:
# Is this an implicit NA?
EGFR.KRAS.mutations %>%
dplyr::count(EGFR, KRAS)# A tibble: 3 × 3
EGFR KRAS n
<lgl> <lgl> <int>
1 FALSE FALSE 1865
2 FALSE TRUE 264
3 TRUE FALSE 16
# Do we have enough power to support our claim?
EGFR.KRAS.mutations %>%
dplyr::count(EGFR, KRAS) %>%
tidyr::complete(EGFR,KRAS, fill = list(n = 0)) %>%
.$n %>% matrix(2) %>%
fisher.test()
Fisher's Exact Test for Count Data
data: .
p-value = 0.2453
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.00000 1.84506
sample estimates:
odds ratio
0
Seems like we don’t have enough power to reject the null hypothesis (chance) for the absence of simultaneous KRAS and EGFR hotspot mutations on any of the cell lines.
Let’s look for other pairs of genes systematically. We will first create a co-occurence matrix among genes that has at least 5 mutant cell line among our 2145 lines.
# first let's simplify our mutation matrix for relatively common mutations
# and binarize it
OmicsSomaticMutationsMatrixHotspot <- OmicsSomaticMutationsMatrixHotspot > 0
# and let's drop genes that doesn't have at least 5 mutant cell lines
OmicsSomaticMutationsMatrixHotspot <- OmicsSomaticMutationsMatrixHotspot[, apply(OmicsSomaticMutationsMatrixHotspot, 2, sum) > 4]
# let's get the pairwise co-occurance counts
co.occurences <- crossprod(OmicsSomaticMutationsMatrixHotspot)
# Here is how the co-occurence matrix looks like
corner(co.occurences) PIK3CD MTOR ARID1A NRAS ZNF648
PIK3CD 5 0 0 0 0
MTOR 0 18 2 1 0
ARID1A 0 2 11 0 0
NRAS 0 1 0 99 0
ZNF648 0 0 0 0 5
Next, let’s write a small helper function that takes a slice of the co-occurence matrix given two genes
# total number of samples
n = nrow(OmicsSomaticMutationsMatrixHotspot)
# a helper function to extract the contingency matrix from the co-occurence matrix
contingency_matrix <- function(gene1, gene2) {
# note that we are keeping n and co.occurances matrix as global variables
n4 = co.occurences[gene1, gene2]
n3 = co.occurences[gene2, gene2] - n4
n2 = co.occurences[gene1, gene1] - n4
n1 = n - n2 - n3 - n4
m <- matrix(c(n1,n2,n3,n4),2)
colnames(m) <- paste0(gene2, c("_WT", "_MUT"))
rownames(m) <- paste0(gene1, c("_WT", "_MUT"))
m
}
# same sanity checks
contingency_matrix("EGFR", "KRAS") KRAS_WT KRAS_MUT
EGFR_WT 1865 264
EGFR_MUT 16 0
contingency_matrix("BRAF", "PTEN") PTEN_WT PTEN_MUT
BRAF_WT 1922 70
BRAF_MUT 151 2
Next, we write another function that takes the contingency matrix and returns the results of the Fisher’s exact test (odd’s ratio and p-value):
# another helper to wrap the fisher's test
is_powered_enough<- function(gene1, gene2) {
m <- contingency_matrix(gene1, gene2)
test <- fisher.test(m)
data.frame(odds.ratio = test$estimate, p.val = test$p.value)
}Finally, we list all pairs of the genes and call our new function to have a summary table:
# let's summarize our tests
test.results <- co.occurences %>%
reshape2::melt(varnames = c("gene1", "gene2")) %>% # melt is a common function, note the column types
dplyr::mutate(gene1 = as.character(gene1),
gene2 = as.character(gene2)) %>%
dplyr::select(-value) %>%
dplyr::filter(gene1 < gene2) %>%
dplyr::rowwise() %>% # note this is the first time we are seeing this, please check the documentation
dplyr::mutate(is_powered_enough(gene1, gene2)) %>% # this is a handy trick
dplyr::ungroup() %>%
dplyr::mutate(q.val = p.adjust(p.val, method = "BH")) # correcting for multiple-hypothesis correction, please check out ?p.adjust()
test.results# A tibble: 3,570 × 5
gene1 gene2 odds.ratio p.val q.val
<chr> <chr> <dbl> <dbl> <dbl>
1 MTOR PIK3CD 0 1 1
2 ARID1A PIK3CD 0 1 1
3 NRAS PIK3CD 0 1 1
4 H3-3A PIK3CD 0 1 1
5 GATA3 PIK3CD 0 1 1
6 FGFR2 PIK3CD 0 1 1
7 HRAS PIK3CD 0 1 1
8 CARNS1 PIK3CD 0 1 1
9 CCND1 PIK3CD 0 1 1
10 ATM PIK3CD 0 1 1
# ℹ 3,560 more rows
Finally, we create a volcano plot based on the final table
p <- test.results %>%
dplyr::mutate(highlight = (q.val < 0.2) | (odds.ratio > 300),
control = (gene1 =="EGFR" ) & (gene2 == "KRAS"),
label = paste0(gene1, "/", gene2)) %>%
ggplot(aes(x = log2(odds.ratio),
y = -log10(q.val),
alpha = highlight|control,
color = highlight,
pair = label)) +
geom_point(show.legend = F) +
geom_hline(yintercept = -log10(0.2), lty = 2, lwd = 1) +
geom_text_repel(aes(label = ifelse(highlight | control, label, NA)),
size = 3,
show.legend = F) +
theme_bw() + scale_color_wsj() +
labs(x = "log2(Odds Ratio)", y = "-log10(Q)", title = "Co-Occurence of Hotspot Mutations Accross DepMap Cell Line Panel")
pWarning: Using alpha for a discrete variable is not advised.
Warning: Removed 3417 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 116 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

You can also quickly create an interactive version (hover over the points to see labels):
plotly::ggplotly(p + theme(legend.position = "none"),
tooltip = c("x", "y", "label"))Warning: Using alpha for a discrete variable is not advised.
Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
Warning in geom2trace.default(dots[[1L]][[3L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
Lastly, we can quickly check some of the pairs and create a quick heatmap:
# Some sanity checks
contingency_matrix("POLE", "XPO1") XPO1_WT XPO1_MUT
POLE_WT 2132 6
POLE_MUT 2 5
contingency_matrix("BRAF", "KRAS") KRAS_WT KRAS_MUT
BRAF_WT 1729 263
BRAF_MUT 152 1
contingency_matrix("APC", "KRAS") KRAS_WT KRAS_MUT
APC_WT 1853 236
APC_MUT 28 28
contingency_matrix("KRAS", "NRAS") NRAS_WT NRAS_MUT
KRAS_WT 1783 98
KRAS_MUT 263 1
# Two of the most common mutations
contingency_matrix("TP53", "KRAS") KRAS_WT KRAS_MUT
TP53_WT 1064 108
TP53_MUT 817 156
is_powered_enough("TP53", "KRAS") odds.ratio p.val
odds ratio 1.880608 1.855958e-06
# A quick heatmap
short.list <- c("KRAS", "NRAS", "BRAF", "EGFR", "TP53", "PIK3R1", "APC", "POLE", "XPO1")
co.occurences[short.list, short.list] KRAS NRAS BRAF EGFR TP53 PIK3R1 APC POLE XPO1
KRAS 264 1 1 0 156 2 28 2 2
NRAS 1 99 4 0 32 0 1 0 0
BRAF 1 4 153 0 39 0 5 0 0
EGFR 0 0 0 16 8 0 0 0 0
TP53 156 32 39 8 973 11 40 5 4
PIK3R1 2 0 0 0 11 15 3 3 3
APC 28 1 5 0 40 3 56 5 3
POLE 2 0 0 0 5 3 5 7 5
XPO1 2 0 0 0 4 3 3 5 11
pheatmap::pheatmap(co.occurences[short.list, short.list],
display_numbers = TRUE)
# This is admittedly primitive!
# Think about how you can improve it? Our analysis here was admittedly primitive, please think about how can you improve it?
Also, let’s remove the mutation and co-occurence matrices we created from memory before moving on to the next vignette.
rm(OmicsSomaticMutationsMatrixHotspot, co.occurences)In this vignette, we will pull compounds targeting MDM2 from Repurposing Primary (single dose at 2.5uM) and examine their response against MDM2 expression and TP53 status.
First, let’s download expression and mutation datasets, along with Repurposing data.
OmicsExpressionProteinCodingGenesTPMLogp1 <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsExpressionProteinCodingGenesTPMLogp1')
OmicsSomaticMutationsMatrixDamaging <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsSomaticMutationsMatrixDamaging')
# simplify the column names
colnames(OmicsExpressionProteinCodingGenesTPMLogp1) %<>% word()
colnames(OmicsSomaticMutationsMatrixDamaging) %<>% word()
# PRISM Repurposing log2fold-change viabilities (from last year)
Repurposing.Column.Meta <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_bootcamp')
Repurposing.Matrix <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_matrix_bootcamp')Next, let’s pull compounds of interest and the relevant columns of the omics data. We will organize them into a single data frame before visualization
# list of MDM2 inhibitors in Rep. Primary dataset
MDM2.inhibitors <- Repurposing.Column.Meta %>%
dplyr::filter(str_detect(Target, "MDM2"),
screen == "REP.PRIMARY")
# tidying the PRISM data for the compounds, MDM2 expression and TP53 mutation status
MDM2i.data <- Repurposing.Matrix[, MDM2.inhibitors$column_name] %>%
reshape2::melt(varnames = c("ModelID", "column_name"), value.name = "LFC") %>%
dplyr::filter(is.finite(LFC)) %>%
dplyr::left_join(MDM2.inhibitors) %>%
dplyr::inner_join(tibble(ModelID = rownames(OmicsSomaticMutationsMatrixDamaging),
TP53.Dam.Mutation = OmicsSomaticMutationsMatrixDamaging[,"TP53"])) %>%
dplyr::inner_join(tibble(ModelID = rownames(OmicsExpressionProteinCodingGenesTPMLogp1),
MDM2.Expression = OmicsExpressionProteinCodingGenesTPMLogp1[,"MDM2"]))Joining with `by = join_by(column_name)`
Joining with `by = join_by(ModelID)`
Joining with `by = join_by(ModelID)`
MDM2i.data %>%
head() ModelID column_name LFC screen
1 ACH-000001 BRD-A12230535-001-06-7::2.5::HTS -0.10925255 REP.PRIMARY
2 ACH-000007 BRD-A12230535-001-06-7::2.5::HTS 0.04379274 REP.PRIMARY
3 ACH-000008 BRD-A12230535-001-06-7::2.5::HTS -1.16365502 REP.PRIMARY
4 ACH-000011 BRD-A12230535-001-06-7::2.5::HTS -0.46752394 REP.PRIMARY
5 ACH-000013 BRD-A12230535-001-06-7::2.5::HTS -0.21656948 REP.PRIMARY
6 ACH-000014 BRD-A12230535-001-06-7::2.5::HTS -2.36298223 REP.PRIMARY
brd dose Name Target MOA TP53.Dam.Mutation
1 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 2
2 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 0
3 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 0
4 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 0
5 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 2
6 BRD-A12230535 2.5 NUTLIN-3 MDM2, TP53 MDM INHIBITOR 0
MDM2.Expression
1 4.208673
2 5.155830
3 6.167719
4 4.989593
5 4.145677
6 6.683416
Finally, let’s create a faceted scatter plot, please think about alternative ways of plotting this data:
MDM2i.data %>%
dplyr::mutate(TP53 = ifelse(TP53.Dam.Mutation, "Mutant", "WT"),
FC = pmin(1,2^LFC)) %>%
ggplot(aes(x = MDM2.Expression, y = FC,
color = TP53, shape = TP53)) +
geom_point(size = 1, alpha = .2) +
stat_cor(show.legend = F, size = 3, label.x = 5) +
geom_smooth(method = "lm", show.legend = TRUE, se = FALSE, lwd = .5) +
facet_wrap(Name ~ .) +
labs(color = "TP53 Status", shape = "TP53 Status",
x = "MDM2 Expression - log2(1+TPM)", y = "Fold Change Viability") +
theme_base(base_size = 12) + scale_color_wsj() +
theme(legend.position = c(0.8, 0.1)) +
coord_cartesian(ylim =c(0,1)) +
scale_shape_manual(values = c(1,4))Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.
`geom_smooth()` using formula = 'y ~ x'

Let’s remove the big datasets from our environment.
rm(OmicsExpressionProteinCodingGenesTPMLogp1, OmicsSomaticMutationsMatrixDamaging, Repurposing.Column.Meta, Repurposing.Matrix)This time we will create a basic correlation-based UMAP of Repurposing Primary dataset where each compound will be representing a compound, and highlight MDM2 inhibitors on the UMAP
# load the PRISM data
Repurposing.Column.Meta <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_bootcamp')
Repurposing.Matrix <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_matrix_bootcamp')
# Choose the columns that corresponds to the PRIMARY dataset
primary.profiles <- Repurposing.Column.Meta %>%
dplyr::filter(screen == "REP.PRIMARY") %>%
.$column_name
PRISM.primary <- Repurposing.Matrix[,primary.profiles]Next, create a correlation matrix among compounds
# use = "p" tells us to use all "pairwise complete observations", please see ?cor()
C <- cor(PRISM.primary, use = "p")
corner(C) BRD-A00077618-236-07-6::2.5::HTS
BRD-A00077618-236-07-6::2.5::HTS 1.000000000
BRD-A00100033-001-08-9::2.5::HTS 0.026813698
BRD-A00147595-001-01-5::2.5::HTS 0.047456175
BRD-A00218260-001-03-4::2.5::HTS 0.025641227
BRD-A00376169-001-01-6::2.5::HTS 0.002923544
BRD-A00100033-001-08-9::2.5::HTS
BRD-A00077618-236-07-6::2.5::HTS 0.02681370
BRD-A00100033-001-08-9::2.5::HTS 1.00000000
BRD-A00147595-001-01-5::2.5::HTS 0.00724496
BRD-A00218260-001-03-4::2.5::HTS 0.02053661
BRD-A00376169-001-01-6::2.5::HTS 0.08125067
BRD-A00147595-001-01-5::2.5::HTS
BRD-A00077618-236-07-6::2.5::HTS 0.04745618
BRD-A00100033-001-08-9::2.5::HTS 0.00724496
BRD-A00147595-001-01-5::2.5::HTS 1.00000000
BRD-A00218260-001-03-4::2.5::HTS 0.01181238
BRD-A00376169-001-01-6::2.5::HTS 0.04036201
BRD-A00218260-001-03-4::2.5::HTS
BRD-A00077618-236-07-6::2.5::HTS 0.02564123
BRD-A00100033-001-08-9::2.5::HTS 0.02053661
BRD-A00147595-001-01-5::2.5::HTS 0.01181238
BRD-A00218260-001-03-4::2.5::HTS 1.00000000
BRD-A00376169-001-01-6::2.5::HTS 0.02655052
BRD-A00376169-001-01-6::2.5::HTS
BRD-A00077618-236-07-6::2.5::HTS 0.002923544
BRD-A00100033-001-08-9::2.5::HTS 0.081250665
BRD-A00147595-001-01-5::2.5::HTS 0.040362008
BRD-A00218260-001-03-4::2.5::HTS 0.026550520
BRD-A00376169-001-01-6::2.5::HTS 1.000000000
Create a umap with default parameters: (Note if the umap function gives an error please re-install “Matrix” and “irlba” libraries from their sources using the lines from the introduction)
# Please check out the other parameters, especially n_neighbors is a critical one
umap.data <- uwot::umap(as.dist(1 - C), min_dist = 0, n_neighbors = 7)
umap.data %<>%
as.tibble() %>%
dplyr::mutate(column_name = rownames(umap.data)) %>%
dplyr::left_join(Repurposing.Column.Meta)Warning: `as.tibble()` was deprecated in tibble 2.0.0.
ℹ Please use `as_tibble()` instead.
ℹ The signature and semantics have changed, see `?as_tibble`.
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
ℹ The deprecated feature was likely used in the tibble package.
Please report the issue at <https://github.com/tidyverse/tibble/issues>.
Joining with `by = join_by(column_name)`
Let’s create some visuals using the umap table
# A static scatter plot
umap.data %>%
dplyr::mutate(highlight = str_detect(Target, "MDM2")) %>%
ggplot(aes(x = V1, y = V2,
alpha = highlight,
color = highlight)) +
geom_point(size = 1 , shape = 1, show.legend = F) +
geom_text_repel(aes(label = ifelse(highlight, Name, NA)),
size = 2, max.overlaps = 20, show.legend = F) +
theme_few() + scale_color_wsj() +
scale_alpha_manual(values = c(.3,1)) +
labs(x = "UMAP1", y = "UMAP2",
title = "MDM2 Inhibitors in PRISM Repurposing (Primary)")Warning: Removed 1302 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 5264 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

# An interactive one
p <- umap.data %>%
as.tibble() %>%
dplyr::mutate(column_name = rownames(umap.data)) %>%
dplyr::left_join(Repurposing.Column.Meta) %>%
ggplot(aes(x = V1, y = V2,
name = Name,
moa = MOA,
target = Target)) +
geom_point(size = .5 , alpha = .5, shape = 1, show.legend = F) +
theme_few() + scale_color_fivethirtyeight() Joining with `by = join_by(column_name, screen, brd, dose, Name, Target, MOA)`
plotly::ggplotly(p)# Exercise:
# What about the most common 8 MOA's?
# Should we filter our pan-toxic or inert compounds before we start? If so
# how would you do that?As an exercise try following:
Can you overlay the toxicity of each compoundas colors? (You can define toxicity as how many cell lines has LFC < -2)
Which are the most common 8 MOA’s in this dataset?
Repeat the same exercise by removing pan-toxic or inert compounds before we calculate the UMAP coordinates.
In this vignette we will run a quick correlation analysis and represent the results using waterfall and volcano plots.
As usual, we start with downloading and organizing our data: Idasanutlin profile from PRISM Repurposing Primary and DepMap expression data.
# expression
OmicsExpressionProteinCodingGenesTPMLogp1 <- load.from.taiga(data.name='internal-23q4-ac2b', data.version=68, data.file='OmicsExpressionProteinCodingGenesTPMLogp1')
colnames(OmicsExpressionProteinCodingGenesTPMLogp1) %<>% word()
# PRISM Repurposing
Repurposing.Column.Meta <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_bootcamp')
Repurposing.Matrix <- load.from.taiga(data.name='prism-repurposing-combined-matrix-0c75', data.version=2, data.file='prism_matrix_bootcamp')
X = OmicsExpressionProteinCodingGenesTPMLogp1
y = Repurposing.Matrix[,"BRD-K62627508-001-01-5::2.5::HTS"] # Check the meta-data to see what is this compound?Next, we write a simple function that computes the correlations between each column of X and y; along with the corresponding p values. If you have WGCNA package installed on your R environment, you can also use WGCNA::corAndPValue() function too.
# Here we define a simple corAndPValue function, note that the rownames of X and names of y should be coming from same set.
corAndPValue <- function(X, y){
y <- y[is.finite(y)]
cl = intersect(rownames(X), names(y))
r = c(cor(X[cl, ], y[cl], use = "p"))
n = apply(X[cl, ] * y[cl],2,function(x) sum(is.finite(x)))
z = r * sqrt(n-2) / sqrt(1-r^2)
p= pt(abs(z), n-2, lower.tail = FALSE)
data.frame(cor =r,
p.value = p,
q.value = p.adjust(p, method = "BH"))
}Here we call our new function on and create a simple volcano plot. I am leaving decorating this plot as an exercise.
result <- corAndPValue(X,y) %>%
dplyr::mutate(gene = colnames(X))Warning in cor(X[cl, ], y[cl], use = "p"): the standard deviation is zero
result %>%
ggplot() +
geom_point(aes(x = cor, y = -log10(q.value)))Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_point()`).

Finally, let’s finish with a waterfall plot (ranks vs correlation):
result %>%
dplyr::filter(!is.na(cor)) %>%
dplyr::arrange(desc(cor)) %>%
dplyr::mutate(rank = 1:n(), n = n()) %>%
dplyr::mutate(highlight = (rank <= 5) | (rank > (n-10)) )%>%
ggplot(aes(x = rank, y= cor, alpha = highlight)) +
geom_point(aes(size = highlight), shape = 1, show.legend = F) +
geom_text_repel(aes(label = ifelse(highlight, gene, NA)),
size = 2, show.legend = F, max.overlaps = 20) +
scale_size_manual(values = c(.5,3)) +
theme_base(base_size = 12, base_family = "GillSans") +
labs(x = "Rank", y = "Pearson Correlation",
title = "Expression Correlates of Idasanutlin")Warning: Using alpha for a discrete variable is not advised.
Warning: Removed 19172 rows containing missing values or values outside the scale range
(`geom_text_repel()`).

rm(X,y, Repurposing.Column.Meta, Repurposing.Matrix, OmicsExpressionProteinCodingGenesTPMLogp1)Here we will take a high level look at the PRISM Oncology Reference Set.
First let’s load the AUC (normalized area under the dose-response curve) matrix and the compound metadata.
OncRef.AUC.matrix <- load.from.taiga(data.name='prism-oncology-reference-set-23q4-1a7c', data.version=13, data.file='AUC_matrix') %>%
t()
compound.metadata <- load.from.taiga(data.name='compound-metadata-de37', data.version=23, data.file='compound_metadata') %>%
dplyr::distinct(Drug.Name,
IDs, MOA,
repurposing_target,
Synonyms) %>%
dplyr::rename(Target = repurposing_target) %>%
tidyr::separate_rows(IDs, sep = ";")
# let's replace column names of AUC matrix with drug names
colnames(OncRef.AUC.matrix) <- tibble(colnames = colnames(OncRef.AUC.matrix)) %>%
dplyr::left_join(compound.metadata %>%
dplyr::distinct(Drug.Name, IDs),
by = c("colnames" = "IDs")) %>%
.$Drug.Name
OncRef.AUC.matrix %>%
corner CYCLOPHOSPHAMIDE DEXAMETHASONE NUTLIN-3 LENALIDOMIDE PREDNISOLONE
ACH-000001 0.9688078 0.9925929 0.9961162 0.8695128 0.9248852
ACH-000002 NA NA 0.9642756 1.0000000 NA
ACH-000004 0.9675471 1.0000000 0.9471196 0.9982972 1.0000000
ACH-000005 0.8882465 1.0000000 0.9059118 0.9717313 1.0000000
ACH-000006 1.0000000 1.0000000 0.9651450 0.9134056 1.0000000
Let’s use the same method we learned in Vignette 3 to create a quick umap:
OncRef.AUC.Cor <- cor(OncRef.AUC.matrix, use = "p")
OncRef.umap <- uwot::umap(as.dist(1-OncRef.AUC.Cor),
min_dist = 0, n_neighbors = 2) Next, we will cluster our compounds into C (= 30 below) clusters using the distances on the umap first and then using the correlation matrix we computed before umap. Check out ?hclust() for hiearchical clustering.
# clustering compounds into C clusters
C = 30
umap_clusters = OncRef.umap %>%
dist() %>%
hclust() %>%
cutree(k = C)
correlation_clusters <- as.dist(1-OncRef.AUC.Cor) %>%
hclust() %>%
cutree(k = C)
# gather clustering indices, umap locations and meta-data in a single table
OncRef.umap <- OncRef.umap %>%
as_tibble() %>%
dplyr::mutate(Drug.Name = colnames(OncRef.AUC.matrix)) %>%
dplyr::left_join(compound.metadata %>%
dplyr::select(-IDs) %>%
dplyr::distinct()) %>%
dplyr::left_join(tibble(Drug.Name = names(umap_clusters),
umap_cluster_id = umap_clusters)) %>%
dplyr::left_join(tibble(Drug.Name = names(correlation_clusters),
correlation_cluster_id = correlation_clusters))Joining with `by = join_by(Drug.Name)`
Joining with `by = join_by(Drug.Name)`
Warning in dplyr::left_join(., tibble(Drug.Name = names(umap_clusters), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 54 of `x` matches multiple rows in `y`.
ℹ Row 74 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
Joining with `by = join_by(Drug.Name)`
Warning in dplyr::left_join(., tibble(Drug.Name = names(correlation_clusters), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 54 of `x` matches multiple rows in `y`.
ℹ Row 54 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
OncRef.umap %>%
head# A tibble: 6 × 8
V1 V2 Drug.Name MOA Target Synonyms umap_cluster_id
<dbl> <dbl> <chr> <chr> <chr> <chr> <int>
1 3.06 -2.16 CYCLOPHOSPHAMIDE DNA alkylating… <NA> CYCLOPH… 1
2 -0.0252 6.12 DEXAMETHASONE agonist of GR … NR3C1 <NA> 2
3 2.65 8.19 NUTLIN-3 inhibitor of T… TP53;… NUTLIN-… 3
4 0.777 -6.18 LENALIDOMIDE degrader of IK… CUL4A… CC-5013… 4
5 -0.0251 6.12 PREDNISOLONE agonist of GR … NR3C1 <NA> 2
6 -5.92 -2.77 CERALASERTIB inhibitor of A… ATR AZD 673… 5
# ℹ 1 more variable: correlation_cluster_id <int>
Let’s plot the umap and overlay both types of clusters.
p1 = OncRef.umap %>%
ggplot() +
geom_point(aes(x = V1,
y = V2,
Drug.Name = Drug.Name,
MOA = MOA,
color = as.factor(umap_cluster_id))) +
labs(color = "UMAP Clusters")Warning in geom_point(aes(x = V1, y = V2, Drug.Name = Drug.Name, MOA = MOA, :
Ignoring unknown aesthetics: Drug.Name and MOA
p2 = OncRef.umap %>%
ggplot() +
geom_point(aes(x = V1,
y = V2,
Drug.Name = Drug.Name,
MOA = MOA,
color = as.factor(correlation_cluster_id))) +
labs(color = "Correlation Clusters")Warning in geom_point(aes(x = V1, y = V2, Drug.Name = Drug.Name, MOA = MOA, :
Ignoring unknown aesthetics: Drug.Name and MOA
cowplot::plot_grid(p1,p2, nrow = 1)
Interactive version (try to add both cluster indices on to the tooltip):
plotly::ggplotly(p1)As we saw before, umaps are not perfect representations of correlation matrices. For small enough matrices, we can use heatmaps instead. See the two examples below:
OncRef.AUC.Cor %>%
pheatmap::pheatmap(show_rownames = FALSE,
treeheight_row = 0,
fontsize_col = 6,
color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
heatmaply::heatmaply(OncRef.AUC.Cor,
show_dendrogram = c(TRUE, FALSE),
xlab = "", ylab = "",
symm = TRUE,
#scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(
# high = "firebrick", mid="white",low = "navy", midpoint = 0,
# limits = c(-1, 1)),
showticklabels = c(FALSE,FALSE),
heatmap_layers = theme(axis.line=element_blank())
)Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
Referenced from: <7A10C056-E1B5-3B12-A4C4-BC4C6FE72F15> /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/modules/R_X11.so
Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
Warning in fix_not_all_unique(rownames(x)): Not all the values are unique -
manually added prefix numbers
Warning in fix_not_all_unique(colnames(x)): Not all the values are unique -
manually added prefix numbers
Please try to include some annotations (check annotation_row and annotation_col arguments for pheatmap() function and see the examples in the documentation).
Also for more sophisticated heatmaps please see ComplexHeatmap package here.